"""Tokenizer smoke test: encode two sentences with a locally saved rbt3 tokenizer."""
import os

# BUGFIX: HF_ENDPOINT must be set BEFORE importing transformers.
# transformers imports huggingface_hub, which reads HF_ENDPOINT once at
# import time to build its endpoint constant — setting it afterwards
# (as the original code did) has no effect on hub downloads.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from transformers import AutoTokenizer

# Load the tokenizer from a local directory; no network access is needed
# when the path exists on disk.
model_path = r"F:\models\rbt3"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Encode a small batch; truncation=True caps each sequence at the
# tokenizer's model max length.
x = tokenizer.batch_encode_plus(
    ['明月装饰了你的窗子', '你装饰了别人的梦'],
    truncation=True,
)
print(x)

from datasets import load_from_disk

# Load the saved ChnSentiCorp dataset and draw a small random sample from
# each split to keep the demo fast.
dataset = load_from_disk(r"F:\dataset\ChnSentiCorp\save")
sample_sizes = {'train': 2000, 'test': 100, 'validation': 200}
for split, size in sample_sizes.items():
    dataset[split] = dataset[split].shuffle().select(range(size))

def f(data):
    """Tokenize a batch of examples: encode each entry of the 'text' column.

    Returns the encoding dict produced by the module-level tokenizer
    (input_ids, attention_mask, ...), truncated to the model max length.
    """
    texts = data['text']
    return tokenizer.batch_encode_plus(texts, truncation=True)


# Tokenize every split in batches; the raw 'text' column is dropped so the
# resulting dataset holds only the encoded features.
dataset = dataset.map(
    f,
    remove_columns=['text'],
    batched=True,        # pass a dict of lists (a batch) to f, not single rows
    batch_size=1000,
    num_proc=1,          # single worker — tokenization here is cheap
)

print(dataset)

